Next steps

# Load libraries
library(RCurl)
library(tidyverse)
library(magrittr)
library(plotly)
library(imputeTS)
library(lubridate)
library(timetk)
library(ggcorrplot)
# Load data (relative file path) and view contents
df <- read.csv("../data/energy_dataset.csv")

# Formatting time variable to datetime object
# NB:  Time contains both UTC+1 and UTC+2 time-zones - date functions automatically pick this up and convert to UTC

df$time <- as_datetime(df$time)
head(df)
dim(df) # Number of rows and columns
## [1] 35064    29
summary(df) # Distribution and missingness for each variable
##       time                     generation.biomass
##  Min.   :2014-12-31 23:00:00   Min.   :  0.0     
##  1st Qu.:2016-01-01 04:45:00   1st Qu.:333.0     
##  Median :2016-12-31 10:30:00   Median :367.0     
##  Mean   :2016-12-31 10:30:00   Mean   :383.5     
##  3rd Qu.:2017-12-31 16:15:00   3rd Qu.:433.0     
##  Max.   :2018-12-31 22:00:00   Max.   :592.0     
##                                NA's   :19        
##  generation.fossil.brown.coal.lignite generation.fossil.coal.derived.gas
##  Min.   :  0.0                        Min.   :0                         
##  1st Qu.:  0.0                        1st Qu.:0                         
##  Median :509.0                        Median :0                         
##  Mean   :448.1                        Mean   :0                         
##  3rd Qu.:757.0                        3rd Qu.:0                         
##  Max.   :999.0                        Max.   :0                         
##  NA's   :18                           NA's   :18                        
##  generation.fossil.gas generation.fossil.hard.coal generation.fossil.oil
##  Min.   :    0         Min.   :   0                Min.   :  0.0        
##  1st Qu.: 4126         1st Qu.:2527                1st Qu.:263.0        
##  Median : 4969         Median :4474                Median :300.0        
##  Mean   : 5623         Mean   :4256                Mean   :298.3        
##  3rd Qu.: 6429         3rd Qu.:5839                3rd Qu.:330.0        
##  Max.   :20034         Max.   :8359                Max.   :449.0        
##  NA's   :18            NA's   :18                  NA's   :19           
##  generation.fossil.oil.shale generation.fossil.peat generation.geothermal
##  Min.   :0                   Min.   :0              Min.   :0            
##  1st Qu.:0                   1st Qu.:0              1st Qu.:0            
##  Median :0                   Median :0              Median :0            
##  Mean   :0                   Mean   :0              Mean   :0            
##  3rd Qu.:0                   3rd Qu.:0              3rd Qu.:0            
##  Max.   :0                   Max.   :0              Max.   :0            
##  NA's   :18                  NA's   :18             NA's   :18           
##  generation.hydro.pumped.storage.aggregated
##  Mode:logical                              
##  NA's:35064                                
##                                            
##                                            
##                                            
##                                            
##                                            
##  generation.hydro.pumped.storage.consumption
##  Min.   :   0.0                             
##  1st Qu.:   0.0                             
##  Median :  68.0                             
##  Mean   : 475.6                             
##  3rd Qu.: 616.0                             
##  Max.   :4523.0                             
##  NA's   :19                                 
##  generation.hydro.run.of.river.and.poundage generation.hydro.water.reservoir
##  Min.   :   0.0                             Min.   :   0                    
##  1st Qu.: 637.0                             1st Qu.:1077                    
##  Median : 906.0                             Median :2164                    
##  Mean   : 972.1                             Mean   :2605                    
##  3rd Qu.:1250.0                             3rd Qu.:3757                    
##  Max.   :2000.0                             Max.   :9728                    
##  NA's   :19                                 NA's   :18                      
##  generation.marine generation.nuclear generation.other
##  Min.   :0         Min.   :   0       Min.   :  0.00  
##  1st Qu.:0         1st Qu.:5760       1st Qu.: 53.00  
##  Median :0         Median :6566       Median : 57.00  
##  Mean   :0         Mean   :6264       Mean   : 60.23  
##  3rd Qu.:0         3rd Qu.:7025       3rd Qu.: 80.00  
##  Max.   :0         Max.   :7117       Max.   :106.00  
##  NA's   :19        NA's   :17         NA's   :18      
##  generation.other.renewable generation.solar generation.waste
##  Min.   :  0.00             Min.   :   0     Min.   :  0.0   
##  1st Qu.: 73.00             1st Qu.:  71     1st Qu.:240.0   
##  Median : 88.00             Median : 616     Median :279.0   
##  Mean   : 85.64             Mean   :1433     Mean   :269.5   
##  3rd Qu.: 97.00             3rd Qu.:2578     3rd Qu.:310.0   
##  Max.   :119.00             Max.   :5792     Max.   :357.0   
##  NA's   :18                 NA's   :18       NA's   :19      
##  generation.wind.offshore generation.wind.onshore forecast.solar.day.ahead
##  Min.   :0                Min.   :    0           Min.   :   0            
##  1st Qu.:0                1st Qu.: 2933           1st Qu.:  69            
##  Median :0                Median : 4849           Median : 576            
##  Mean   :0                Mean   : 5464           Mean   :1439            
##  3rd Qu.:0                3rd Qu.: 7398           3rd Qu.:2636            
##  Max.   :0                Max.   :17436           Max.   :5836            
##  NA's   :18               NA's   :18                                      
##  forecast.wind.offshore.eday.ahead forecast.wind.onshore.day.ahead
##  Mode:logical                      Min.   :  237                  
##  NA's:35064                        1st Qu.: 2979                  
##                                    Median : 4855                  
##                                    Mean   : 5471                  
##                                    3rd Qu.: 7353                  
##                                    Max.   :17430                  
##                                                                   
##  total.load.forecast total.load.actual price.day.ahead   price.actual   
##  Min.   :18105       Min.   :18041     Min.   :  2.06   Min.   :  9.33  
##  1st Qu.:24794       1st Qu.:24808     1st Qu.: 41.49   1st Qu.: 49.35  
##  Median :28906       Median :28901     Median : 50.52   Median : 58.02  
##  Mean   :28712       Mean   :28697     Mean   : 49.87   Mean   : 57.88  
##  3rd Qu.:32263       3rd Qu.:32192     3rd Qu.: 60.53   3rd Qu.: 68.01  
##  Max.   :41390       Max.   :41015     Max.   :101.99   Max.   :116.80  
##                      NA's   :36

Missing data

There are eight columns that contain no values (all NAs or zeroes). These are removed here.

df <- df %>% 
  select(-c(generation.fossil.coal.derived.gas,
            generation.fossil.oil.shale,
            generation.fossil.peat,
            generation.geothermal,
            generation.marine,
            generation.hydro.pumped.storage.aggregated,
            generation.wind.offshore,
            forecast.wind.offshore.eday.ahead))
colSums(is.na(df)) # Number of missing values for each remaining variable
##                                        time 
##                                           0 
##                          generation.biomass 
##                                          19 
##        generation.fossil.brown.coal.lignite 
##                                          18 
##                       generation.fossil.gas 
##                                          18 
##                 generation.fossil.hard.coal 
##                                          18 
##                       generation.fossil.oil 
##                                          19 
## generation.hydro.pumped.storage.consumption 
##                                          19 
##  generation.hydro.run.of.river.and.poundage 
##                                          19 
##            generation.hydro.water.reservoir 
##                                          18 
##                          generation.nuclear 
##                                          17 
##                            generation.other 
##                                          18 
##                  generation.other.renewable 
##                                          18 
##                            generation.solar 
##                                          18 
##                            generation.waste 
##                                          19 
##                     generation.wind.onshore 
##                                          18 
##                    forecast.solar.day.ahead 
##                                           0 
##             forecast.wind.onshore.day.ahead 
##                                           0 
##                         total.load.forecast 
##                                           0 
##                           total.load.actual 
##                                          36 
##                             price.day.ahead 
##                                           0 
##                                price.actual 
##                                           0

16 out of 22 of the remaining variables each have a maximum of 36 missing values. Looking at where these occur, we see that most missing values occur across variables. Specifically, 90% of the missing values occur over only 18 rows. The remaining 10% are spread across 28 rows.

# Rows where data is missing
df[!complete.cases(df),]
# Number of NAs per row
missing_per_row <- rowSums(is.na(df[!complete.cases(df),]))
missing_per_row
##   100   109   110   111   112   113   114   452   453   644   662   752   753 
##    14    15    15    15    15    15    15    14    14    14    15     1     1 
##   754   757   758   759   760   761   762   763   764  2259  2529  2624  2709 
##     1     1     1     1     1     1     1     1     1     1    15     1    15 
##  2914  3555  3969  6584  6587  8050 11237 11525 11527 11903 12673 13342 13392 
##     1     1    14     1    15    15     1     1     1     1     1    14     1 
## 15273 15983 16613 25165 25172 30186 30897 
##     1     1     1     1     1     1    15
sum(missing_per_row > 1)
## [1] 18
sum(missing_per_row[missing_per_row > 1])/sum(missing_per_row)
## [1] 0.9041096

We can visualise where in the series the missing values occur using the following function:

ggplot_na_distribution(df$generation.biomass)

However, this only allows us to look at one variable at a time. There are other functions in the imputeTS package which can be useful for investigating missingness in a (univariate) time series.

We have decided to interpolate the missing values using linear interpolation. The only places where this might be an issue is where are two gaps of length 6 and 8. These will be visualised below.

# Interpolate missing values using linear interpolation
df_clean <- df %>% mutate(across(generation.biomass:price.actual, .fns = na_interpolation, option = 'linear'))

# Verifying missing values have been filled
df_clean[!complete.cases(df_clean),]

Visualising interpolation

# Creating a 'missing' flag to indicate imputation
df_clean$interp_biomass <- as.integer(is.na(df$generation.biomass))

ggplot(data = df_clean[90:150,]) + geom_line(aes(x=time, y = generation.biomass, colour = interp_biomass), show.legend = FALSE) + scale_color_gradient(low="black", high="red") + labs(title = "Example of linear interpolation (in red) applied to the dataset")

What are we left with in df_clean?

35,000 obs across 20 variables, hourly from start 2015 to end 2018:

And in the other dataset, not yet loaded: hourly weather data over the same period for a number of Spanish cities

Focusing on generation

gen_data <- df_clean %>% 
  select(c(1, starts_with("generation")))

# Giving vars simpler names
names(gen_data) <- c("time",
                     "Biomass",
                    "Lignite",
                    "Gas",
                    "Hard Coal",
                    "Oil",
                    "Hydro Pumped",
                    "Hydro River",
                    "Hydro Reservoir",
                    "Nuclear",
                    "Other",
                    "Other Renewable",
                    "Solar",
                    "Waste",
                    "Wind Onshore")
# Converting to long form
long_gen_data <- gen_data  %>% 
  pivot_longer(cols = -time, names_to = "generation_source")

head(long_gen_data)

Getting statistics over entire dataset for each source

overall_stats <- long_gen_data %>%
  group_by(generation_source) %>%
  summarise(sum = sum(value, na.rm = TRUE),
            mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE)) %>%
  mutate(generation_source= fct_reorder(generation_source, desc(sum)))

sources_by_sum <- arrange(overall_stats, desc(sum))
sources_by_sum
ggplot(data = overall_stats) + geom_col(aes(x = generation_source, y = sum)) + theme(axis.text.x = element_text(angle=90)) + labs(title = "Total electricity generated by source")

sources_by_sum %>% pivot_longer(cols = c(sum, mean, sd, max), names_to = "stat", 
                                values_to = "value") %>%
  filter(stat != "sum") %>%
  ggplot() + geom_col(aes(x = generation_source, y = value, fill = stat), position = "dodge") + theme(axis.text.x = element_text(angle=90)) + labs(title = "Other overall hourly statistics of each source")

Using this information to re-factor the generation_source variable, so that plots will order the sources from most to least important.

long_gen_data$generation_source <- factor(long_gen_data$generation_source, levels =  sources_by_sum$generation_source)

Smoothing the data on a daily scale

gen_data_smoothed <- long_gen_data %>% 
  group_by(generation_source) %>% 
  summarise_by_time(time, .by = 'day', adjusted = mean(value)) 

head(gen_data_smoothed)

Looking at each series over time in isolation, with quarterly smoothing (This function is nice as it includes an interactive feature)

gen_data_smoothed %>% 
  filter(generation_source %in% sources_by_sum$generation_source[1:6]) %>% 
  plot_time_series(time, adjusted, .facet_ncol = 3,
                   .smooth_period = "3 months",
                   .title= "Six biggest generation sources by day, with 3-month smoothing line")
gen_data_smoothed %>% 
  filter(generation_source %in% sources_by_sum$generation_source[7:14]) %>% 
  plot_time_series(time, adjusted, .facet_ncol = 3,
                   .smooth_period = "3 months",
                   .title= "Seven smallest generation sources by day, with 3-month smoothing")